{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "该数据集有100个样本和5个特征\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "dataset_filename = \"affinity_dataset.txt\"\n",
    "X = np.loadtxt(dataset_filename)\n",
    "n_samples, n_features = X.shape\n",
    "print(\"该数据集有{0}个样本和{1}个特征\".format(n_samples, n_features))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0. 0. 1. 1. 1.]\n",
      " [1. 1. 0. 1. 0.]\n",
      " [1. 0. 1. 1. 0.]\n",
      " [0. 0. 1. 1. 1.]\n",
      " [0. 1. 0. 0. 1.]]\n"
     ]
    }
   ],
   "source": [
    "print(X[:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = [\"bread\", \"milk\", \"cheese\", \"apples\", \"bananas\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In our first example, we will compute the Support and Confidence of the rule \"If a person buys Apples, they also buy Bananas\"."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "36人买了苹果\n"
     ]
    }
   ],
   "source": [
    "num_apple_purchases = 0\n",
    "for sample in X:\n",
    "    if sample[3] == 1:\n",
    "        num_apple_purchases += 1\n",
    "print(\"{0}人买了苹果\".format(num_apple_purchases))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "21个有效样本\n",
      "15个无效样本\n"
     ]
    }
   ],
   "source": [
    "rule_valid = 0\n",
    "rule_invalid = 0\n",
    "for sample in X:\n",
    "    if sample[3] == 1:  # 这个人买了苹果\n",
    "        if sample[4] == 1:\n",
    "            # 这个人既买了苹果，也买了香蕉\n",
    "            rule_valid += 1\n",
    "        else:\n",
    "            # 这个人既买了苹果，没有买香蕉\n",
    "            rule_invalid += 1\n",
    "print(\"{0}个有效样本\".format(rule_valid))\n",
    "print(\"{0}个无效样本\".format(rule_invalid))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "有效规则出现的频率为21，置信度为58.3%.\n"
     ]
    }
   ],
   "source": [
    "frequency = rule_valid  # 有效规则出现的频率\n",
    "confidence = rule_valid / num_apple_purchases\n",
    "print(\"有效规则出现的频率为{0}，置信度为{1:.1f}%.\".format(frequency, 100 * confidence))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "valid_rules = defaultdict(int)\n",
    "invalid_rules = defaultdict(int)\n",
    "num_occurences = defaultdict(int)\n",
    "\n",
    "for sample in X:\n",
    "    for productA in range(n_features):\n",
    "        if sample[productA] == 0: continue\n",
    "        num_occurences[productA] += 1\n",
    "        for productB in range(n_features):\n",
    "            if productA == productB: \n",
    "                continue\n",
    "            if sample[productB] == 1:\n",
    "                # 这个人同时买了商品A和商品B\n",
    "                valid_rules[(productA, productB)] += 1\n",
    "            else:\n",
    "                # 这个人同时买了商品A，没有买商品B\n",
    "                invalid_rules[(productA, productB)] += 1\n",
    "frequency = valid_rules\n",
    "confidence = defaultdict(float)\n",
    "for productA, productB in valid_rules.keys():\n",
    "    confidence[(productA, productB)] = valid_rules[(productA, productB)] / num_occurences[productA]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "规则: 如果一个人买了cheese，他们也会买apples\n",
      " - 置信度: 0.610\n",
      " - 频率: 25\n",
      "\n",
      "规则: 如果一个人买了cheese，他们也会买bananas\n",
      " - 置信度: 0.659\n",
      " - 频率: 27\n",
      "\n",
      "规则: 如果一个人买了apples，他们也会买cheese\n",
      " - 置信度: 0.694\n",
      " - 频率: 25\n",
      "\n",
      "规则: 如果一个人买了apples，他们也会买bananas\n",
      " - 置信度: 0.583\n",
      " - 频率: 21\n",
      "\n",
      "规则: 如果一个人买了bananas，他们也会买cheese\n",
      " - 置信度: 0.458\n",
      " - 频率: 27\n",
      "\n",
      "规则: 如果一个人买了bananas，他们也会买apples\n",
      " - 置信度: 0.356\n",
      " - 频率: 21\n",
      "\n",
      "规则: 如果一个人买了bread，他们也会买milk\n",
      " - 置信度: 0.519\n",
      " - 频率: 14\n",
      "\n",
      "规则: 如果一个人买了bread，他们也会买apples\n",
      " - 置信度: 0.185\n",
      " - 频率: 5\n",
      "\n",
      "规则: 如果一个人买了milk，他们也会买bread\n",
      " - 置信度: 0.304\n",
      " - 频率: 14\n",
      "\n",
      "规则: 如果一个人买了milk，他们也会买apples\n",
      " - 置信度: 0.196\n",
      " - 频率: 9\n",
      "\n",
      "规则: 如果一个人买了apples，他们也会买bread\n",
      " - 置信度: 0.139\n",
      " - 频率: 5\n",
      "\n",
      "规则: 如果一个人买了apples，他们也会买milk\n",
      " - 置信度: 0.250\n",
      " - 频率: 9\n",
      "\n",
      "规则: 如果一个人买了bread，他们也会买cheese\n",
      " - 置信度: 0.148\n",
      " - 频率: 4\n",
      "\n",
      "规则: 如果一个人买了cheese，他们也会买bread\n",
      " - 置信度: 0.098\n",
      " - 频率: 4\n",
      "\n",
      "规则: 如果一个人买了milk，他们也会买bananas\n",
      " - 置信度: 0.413\n",
      " - 频率: 19\n",
      "\n",
      "规则: 如果一个人买了bananas，他们也会买milk\n",
      " - 置信度: 0.322\n",
      " - 频率: 19\n",
      "\n",
      "规则: 如果一个人买了bread，他们也会买bananas\n",
      " - 置信度: 0.630\n",
      " - 频率: 17\n",
      "\n",
      "规则: 如果一个人买了bananas，他们也会买bread\n",
      " - 置信度: 0.288\n",
      " - 频率: 17\n",
      "\n",
      "规则: 如果一个人买了milk，他们也会买cheese\n",
      " - 置信度: 0.152\n",
      " - 频率: 7\n",
      "\n",
      "规则: 如果一个人买了cheese，他们也会买milk\n",
      " - 置信度: 0.171\n",
      " - 频率: 7\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for productA, productB in confidence:\n",
    "    productA_name = features[productA]\n",
    "    productB_name = features[productB]\n",
    "    print(\"规则: 如果一个人买了{0}，他们也会买{1}\".format(productA_name, productB_name))\n",
    "    print(\" - 置信度: {0:.3f}\".format(confidence[(productA, productB)]))\n",
    "    print(\" - 频率: {0}\".format(frequency[(productA, productB)]))\n",
    "    print(\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_rule(productA, productB, frequency, confidence, features):\n",
    "    productA_name = features[productA]\n",
    "    productB_name = features[productB]\n",
    "    print(\"规则: 如果一个人买了{0}，他们也会买{1}\".format(productA_name, productB_name))\n",
    "    print(\" - 置信度: {0:.2f}%\".format(100 * confidence[(productA, productB)]))\n",
    "    print(\" - 频率: {0}\".format(frequency[(productA, productB)]))\n",
    "    print(\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "规则: 如果一个人买了apples，他们也会买bananas\n",
      " - 置信度: 58.33%\n",
      " - 频率: 21\n",
      "\n"
     ]
    }
   ],
   "source": [
    "productA = 3\n",
    "productB = 4\n",
    "print_rule(productA, productB, frequency, confidence, features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[((0, 1), 14),\n",
      " ((1, 2), 7),\n",
      " ((3, 2), 25),\n",
      " ((1, 3), 9),\n",
      " ((0, 2), 4),\n",
      " ((3, 0), 5),\n",
      " ((4, 1), 19),\n",
      " ((3, 1), 9),\n",
      " ((1, 4), 19),\n",
      " ((2, 4), 27),\n",
      " ((2, 0), 4),\n",
      " ((2, 3), 25),\n",
      " ((2, 1), 7),\n",
      " ((4, 3), 21),\n",
      " ((0, 4), 17),\n",
      " ((4, 2), 27),\n",
      " ((1, 0), 14),\n",
      " ((3, 4), 21),\n",
      " ((0, 3), 5),\n",
      " ((4, 0), 17)]\n"
     ]
    }
   ],
   "source": [
    "# Sort by support\n",
    "from pprint import pprint\n",
    "pprint(list(support.items()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "from operator import itemgetter\n",
    "sorted_frequency = sorted(frequency.items(), key=itemgetter(1), reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "规则 #1\n",
      "规则: 如果一个人买了cheese，他们也会买bananas\n",
      " - 置信度: 65.85%\n",
      " - 频率: 27\n",
      "\n",
      "规则 #2\n",
      "规则: 如果一个人买了bananas，他们也会买cheese\n",
      " - 置信度: 45.76%\n",
      " - 频率: 27\n",
      "\n",
      "规则 #3\n",
      "规则: 如果一个人买了cheese，他们也会买apples\n",
      " - 置信度: 60.98%\n",
      " - 频率: 25\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for index in range(3):\n",
    "    print(\"规则 #{0}\".format(index + 1))\n",
    "    (productA, productB) = sorted_frequency[index][0]\n",
    "    print_rule(productA, productB, frequency, confidence, features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "规则 #1\n",
      "规则: 如果一个人买了apples，他们也会买cheese\n",
      " - 置信度: 69.44%\n",
      " - 频率: 25\n",
      "\n",
      "规则 #2\n",
      "规则: 如果一个人买了cheese，他们也会买bananas\n",
      " - 置信度: 65.85%\n",
      " - 频率: 27\n",
      "\n",
      "规则 #3\n",
      "规则: 如果一个人买了bread，他们也会买bananas\n",
      " - 置信度: 62.96%\n",
      " - 频率: 17\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for index in range(3):\n",
    "    print(\"规则 #{0}\".format(index + 1))\n",
    "    (productA, productB) = sorted_confidence[index][0]\n",
    "    print_rule(productA, productB, frequency, confidence, features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
